In [1]:
import pandas as pd,numpy as np,matplotlib.pyplot as plt,seaborn as sns
In [2]:
df=pd.read_csv(r"C:\Users\alasy\OneDrive\Desktop\final_df_olist1.csv")
df
C:\Users\alasy\AppData\Local\Temp\ipykernel_4984\334602695.py:1: DtypeWarning: Columns (11,12,13,14) have mixed types. Specify dtype option on import or set low_memory=False.
  df=pd.read_csv(r"C:\Users\alasy\OneDrive\Desktop\final_df_olist1.csv")
Out[2]:
Unnamed: 0 order_id order_item_id product_id seller_id shipping_limit_date price freight_value product_name_lenght product_description_lenght ... payment_type payment_installments payment_value product_category_name_english seller_lat seller_lng customer_lat customer_lng product_category cust_Region
0 0 00010242fe8c5a6d1ba2dd792cb16214 1.0 4244733e06e7ecb4970a6e2683c13e61 48436dade18ac8b2bce089ec2a041202 2017-09-19 09:45:35 58.90 13.29 58.0 598.0 ... credit_card 2.0 72.19 cool_stuff -22.497188 -44.127324 -21.763186 -41.310265 Fashion Southeast
1 1 5c94ad4e194c0e6794688a9d2b9ea94b 1.0 e0cf79767c5b016251fe139915c59a26 da8622b14eb17ae2831f4ac5b9dab84a 2018-03-16 18:50:24 29.90 18.23 55.0 388.0 ... boleto 1.0 48.13 health_beauty -22.708485 -47.664918 -21.763186 -41.310265 Health and Beauty Southeast
2 2 bd31b009e1dbc47fc7c250b1e2cf5440 1.0 92e2d2146e433cd4d1f09a3f8633ead0 4a3ca9315b744ce9f8e9374361493884 2018-05-02 03:51:18 52.90 22.95 53.0 714.0 ... boleto 1.0 75.85 bed_bath_table -21.757225 -48.829541 -21.763186 -41.310265 Furniture Southeast
3 3 bb4eb0196897c20281a61f75ce23211c 1.0 8cee1d824765335c48ccc515c2ecf4b8 4a3ca9315b744ce9f8e9374361493884 2017-07-28 14:50:11 96.00 15.42 54.0 385.0 ... credit_card 9.0 111.42 bed_bath_table -21.757225 -48.829541 -21.763186 -41.310265 Furniture Southeast
4 4 f9847bf9cc7336c6ba07fe2bdbb6cae1 1.0 3e5201fe0d1ba474d9b90152c83c706c 8160255418d5aaa7dbdc9f4c64ebda44 2018-05-16 18:30:50 125.90 13.30 56.0 255.0 ... credit_card 6.0 139.20 bed_bath_table -21.757225 -48.829541 -21.763186 -41.310265 Furniture Southeast
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
112633 117720 bb05bd3bbacf1e3c6026b43b44a6631c 1.0 a0425426bd812474106a782b2979e310 eb1bf309f4f6af9a97e9ddac3fdbeabe 2018-05-29 02:55:03 22.30 8.88 57.0 1417.0 ... boleto 1.0 31.18 sports_leisure -23.702611 -46.596395 -23.570953 -46.624567 Entertainment Southeast
112634 117721 c003c1face3c31cba0af4ab5314f3332 1.0 dedb5adefe63796ce6edd624d0ebd8a8 70ee69419916dc13e3aed619061f996c 2017-08-18 11:05:09 45.00 8.72 43.0 642.0 ... credit_card 5.0 53.72 furniture_decor -23.520595 -46.178266 -23.456257 -46.937905 Furniture Southeast
112635 117725 cec79ef92819a9f1f2009e1d8cf26db8 1.0 ae8a92195eb75e94998a210eeb51693e 1d139e3a3b14025640d8df1b230aace0 2017-09-19 12:35:16 7.48 15.10 43.0 513.0 ... credit_card 2.0 22.58 sports_leisure -21.190973 -48.156353 -17.086066 -42.255281 Entertainment Southeast
112636 117726 e9613fa6e02ff8ae45f37ca30b0468a1 1.0 f619f125af192e28da946f3382e783a6 9b00cad94ef3078faf6fba2e792c158f 2017-12-27 12:17:28 354.90 18.92 53.0 234.0 ... voucher 1.0 250.00 sports_leisure -15.801399 -43.310325 -27.101374 -51.246351 Entertainment South
112637 117727 e9613fa6e02ff8ae45f37ca30b0468a1 1.0 f619f125af192e28da946f3382e783a6 9b00cad94ef3078faf6fba2e792c158f 2017-12-27 12:17:28 354.90 18.92 53.0 234.0 ... credit_card 5.0 123.82 sports_leisure -15.801399 -43.310325 -27.101374 -51.246351 Entertainment South

112638 rows × 46 columns

In [3]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112638 entries, 0 to 112637
Data columns (total 46 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   Unnamed: 0                     112638 non-null  int64  
 1   order_id                       112638 non-null  object 
 2   order_item_id                  112638 non-null  float64
 3   product_id                     112638 non-null  object 
 4   seller_id                      112638 non-null  object 
 5   shipping_limit_date            112638 non-null  object 
 6   price                          112638 non-null  float64
 7   freight_value                  112638 non-null  float64
 8   product_name_lenght            112638 non-null  float64
 9   product_description_lenght     112638 non-null  float64
 10  product_photos_qty             112638 non-null  float64
 11  product_weight_g               112638 non-null  object 
 12  product_length_cm              112638 non-null  object 
 13  product_height_cm              112638 non-null  object 
 14  product_width_cm               112638 non-null  object 
 15  seller_zip_code_prefix         112638 non-null  float64
 16  seller_city                    112638 non-null  object 
 17  seller_state                   112638 non-null  object 
 18  customer_id                    112638 non-null  object 
 19  order_status                   112638 non-null  object 
 20  order_purchase_timestamp       112638 non-null  object 
 21  order_approved_at              112638 non-null  object 
 22  order_delivered_carrier_date   112638 non-null  object 
 23  order_delivered_customer_date  112638 non-null  object 
 24  order_estimated_delivery_date  112638 non-null  object 
 25  customer_unique_id             112638 non-null  object 
 26  customer_zip_code_prefix       112638 non-null  int64  
 27  customer_city                  112638 non-null  object 
 28  customer_state                 112638 non-null  object 
 29  review_id                      112638 non-null  object 
 30  review_score                   112638 non-null  float64
 31  review_comment_title           112638 non-null  object 
 32  review_comment_message         112638 non-null  object 
 33  review_creation_date           112638 non-null  object 
 34  review_answer_timestamp        112638 non-null  object 
 35  payment_sequential             112638 non-null  float64
 36  payment_type                   112638 non-null  object 
 37  payment_installments           112638 non-null  float64
 38  payment_value                  112638 non-null  float64
 39  product_category_name_english  112638 non-null  object 
 40  seller_lat                     112638 non-null  float64
 41  seller_lng                     112638 non-null  float64
 42  customer_lat                   112638 non-null  float64
 43  customer_lng                   112638 non-null  float64
 44  product_category               112638 non-null  object 
 45  cust_Region                    112638 non-null  object 
dtypes: float64(15), int64(2), object(29)
memory usage: 39.5+ MB
In [4]:
df=df.drop(columns='Unnamed: 0',axis=1)
In [5]:
df.describe()
Out[5]:
order_item_id price freight_value product_name_lenght product_description_lenght product_photos_qty seller_zip_code_prefix customer_zip_code_prefix review_score payment_sequential payment_installments payment_value seller_lat seller_lng customer_lat customer_lng
count 112638.000000 112638.000000 112638.000000 112638.000000 112638.000000 112638.000000 112638.000000 112638.000000 112638.000000 112638.000000 112638.000000 112638.000000 112638.000000 112638.000000 112638.000000 112638.000000
mean 1.195289 119.890695 19.994579 48.790453 784.760658 2.201673 24534.728049 35009.119418 4.082281 1.091106 2.941929 171.624030 -22.793489 -47.249482 -21.244659 -46.205792
std 0.688433 181.346106 15.724898 10.020038 650.963105 1.713659 27658.740625 29856.092601 1.346116 0.688239 2.778200 264.617854 2.703788 2.345126 5.555091 4.039370
min 1.000000 0.850000 0.000000 5.000000 4.000000 1.000000 1001.000000 1003.000000 1.000000 1.000000 0.000000 0.000000 -32.075303 -63.893789 -33.689890 -72.668821
25% 1.000000 39.900000 13.072500 42.000000 345.000000 1.000000 6429.000000 11095.000000 4.000000 1.000000 1.000000 60.772500 -23.609666 -48.829541 -23.591175 -48.109939
50% 1.000000 74.900000 16.310000 52.000000 600.000000 1.000000 13690.000000 24230.000000 5.000000 1.000000 2.000000 107.900000 -23.424361 -46.756262 -22.929004 -46.632945
75% 1.000000 133.000000 21.180000 57.000000 982.000000 3.000000 28035.000000 58400.000000 5.000000 1.000000 4.000000 189.157500 -21.757225 -46.522432 -20.197823 -43.658299
max 21.000000 6735.000000 409.680000 76.000000 3992.000000 20.000000 99730.000000 99980.000000 5.000000 26.000000 24.000000 13664.080000 -2.503158 -34.855831 42.184003 -8.723762
In [6]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112638 entries, 0 to 112637
Data columns (total 45 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   order_id                       112638 non-null  object 
 1   order_item_id                  112638 non-null  float64
 2   product_id                     112638 non-null  object 
 3   seller_id                      112638 non-null  object 
 4   shipping_limit_date            112638 non-null  object 
 5   price                          112638 non-null  float64
 6   freight_value                  112638 non-null  float64
 7   product_name_lenght            112638 non-null  float64
 8   product_description_lenght     112638 non-null  float64
 9   product_photos_qty             112638 non-null  float64
 10  product_weight_g               112638 non-null  object 
 11  product_length_cm              112638 non-null  object 
 12  product_height_cm              112638 non-null  object 
 13  product_width_cm               112638 non-null  object 
 14  seller_zip_code_prefix         112638 non-null  float64
 15  seller_city                    112638 non-null  object 
 16  seller_state                   112638 non-null  object 
 17  customer_id                    112638 non-null  object 
 18  order_status                   112638 non-null  object 
 19  order_purchase_timestamp       112638 non-null  object 
 20  order_approved_at              112638 non-null  object 
 21  order_delivered_carrier_date   112638 non-null  object 
 22  order_delivered_customer_date  112638 non-null  object 
 23  order_estimated_delivery_date  112638 non-null  object 
 24  customer_unique_id             112638 non-null  object 
 25  customer_zip_code_prefix       112638 non-null  int64  
 26  customer_city                  112638 non-null  object 
 27  customer_state                 112638 non-null  object 
 28  review_id                      112638 non-null  object 
 29  review_score                   112638 non-null  float64
 30  review_comment_title           112638 non-null  object 
 31  review_comment_message         112638 non-null  object 
 32  review_creation_date           112638 non-null  object 
 33  review_answer_timestamp        112638 non-null  object 
 34  payment_sequential             112638 non-null  float64
 35  payment_type                   112638 non-null  object 
 36  payment_installments           112638 non-null  float64
 37  payment_value                  112638 non-null  float64
 38  product_category_name_english  112638 non-null  object 
 39  seller_lat                     112638 non-null  float64
 40  seller_lng                     112638 non-null  float64
 41  customer_lat                   112638 non-null  float64
 42  customer_lng                   112638 non-null  float64
 43  product_category               112638 non-null  object 
 44  cust_Region                    112638 non-null  object 
dtypes: float64(15), int64(1), object(29)
memory usage: 38.7+ MB
In [7]:
df.isnull().sum()
Out[7]:
order_id                         0
order_item_id                    0
product_id                       0
seller_id                        0
shipping_limit_date              0
price                            0
freight_value                    0
product_name_lenght              0
product_description_lenght       0
product_photos_qty               0
product_weight_g                 0
product_length_cm                0
product_height_cm                0
product_width_cm                 0
seller_zip_code_prefix           0
seller_city                      0
seller_state                     0
customer_id                      0
order_status                     0
order_purchase_timestamp         0
order_approved_at                0
order_delivered_carrier_date     0
order_delivered_customer_date    0
order_estimated_delivery_date    0
customer_unique_id               0
customer_zip_code_prefix         0
customer_city                    0
customer_state                   0
review_id                        0
review_score                     0
review_comment_title             0
review_comment_message           0
review_creation_date             0
review_answer_timestamp          0
payment_sequential               0
payment_type                     0
payment_installments             0
payment_value                    0
product_category_name_english    0
seller_lat                       0
seller_lng                       0
customer_lat                     0
customer_lng                     0
product_category                 0
cust_Region                      0
dtype: int64
In [8]:
#plt.figure(figsize=(10,10))
#sns.pairplot(df)
In [9]:
plt.figure(figsize=(10,10))
sns.countplot(x ='customer_state', data=df,order=df['customer_state'].value_counts().sort_values().index)
Out[9]:
<AxesSubplot:xlabel='customer_state', ylabel='count'>

feature engineering with date time¶

In [10]:
from datetime import datetime
In [11]:
df['delivered_time']=pd.to_datetime(df['order_delivered_customer_date'],
                                            format='%Y-%m-%d').dt.date
df['estimate_time']=pd.to_datetime(df['order_estimated_delivery_date'], 
                                           format='%Y-%m-%d').dt.date

#Delivered time and Estimate time features created
In [12]:
df['weekly']=pd.to_datetime(df['order_delivered_customer_date'],
                                    format='%Y-%m-%d').dt.week


## created weekly feature based on order delivered customer date
C:\Users\alasy\AppData\Local\Temp\ipykernel_4984\3795791945.py:1: FutureWarning: Series.dt.weekofyear and Series.dt.week have been deprecated. Please use Series.dt.isocalendar().week instead.
  df['weekly']=pd.to_datetime(df['order_delivered_customer_date'],
In [13]:
df['yearly']=pd.to_datetime(df['order_delivered_customer_date'])\
                    .dt.to_period('M')
df['yearly']= df['yearly'].astype(str)

##craeted yearly based on order delivered customer date
In [14]:
df['diff_days']= df['delivered_time']-df['estimate_time']
df['diff_days']= df['diff_days'].dt.days

##finding different days of delivered and estimated deliveries
In [15]:
plt.figure(figsize=(20,10))
sns.lineplot(x='weekly', y='diff_days', data=df,estimator='mean')

plt.xlabel("Weeks")
plt.ylabel("Difference Days")
plt.title("Average Difference Days per Week")
Out[15]:
Text(0.5, 1.0, 'Average Difference Days per Week')
In [16]:
## from the above graph we can understand that the earlies deliveries inthe month of october as compare to other moths in the year
## as we can also see that latest deliveries are done in the month of june to august in whole year 

customers purchased top 10 products¶

In [17]:
len(df['product_id'].unique())
Out[17]:
31371
In [18]:
len(df['product_id'].str[-18:].unique())
Out[18]:
31371
In [19]:
df['product_id_']=df['product_id'].str[-18:]
In [20]:
plt.figure(figsize=(20,10))
sns.countplot(x='product_id', data=df, palette='gist_earth',
             order=df['product_id'].value_counts()[:10]\
             .sort_values().index).set_title("Top 10 Products", fontsize=15,
                                             weight='bold')
Out[20]:
Text(0.5, 1.0, 'Top 10 Products')
In [21]:
## the above graph shows the top 10 produts , Because of the data information , we can't say the what are the products ,
## but we can assume that  which category they belong to
In [22]:
df.groupby(["product_category_name_english"])["product_id_"].count().sort_values(ascending=False).head(10)
Out[22]:
product_category_name_english
bed_bath_table           11657
health_beauty             9721
sports_leisure            8718
furniture_decor           8534
computers_accessories     7901
housewares                7136
watches_gifts             6006
telephony                 4547
garden_tools              4440
auto                      4249
Name: product_id_, dtype: int64
In [23]:
plt.figure(figsize=(15,15))
plt.title("Counts of all products")
plt.xticks(rotation='vertical')
sns.countplot(x='product_category_name_english',data=df)
## countplot of products
Out[23]:
<AxesSubplot:title={'center':'Counts of all products'}, xlabel='product_category_name_english', ylabel='count'>
In [24]:
group_category=df.groupby(['product_id_','product_category_name_english',])['product_id_']\
                                        .count().sort_values(ascending=False).head(10)
group_category
Out[24]:
product_id_         product_category_name_english
7b8ebd4e68314663af  furniture_decor                  524
2990de24d770e7f83d  garden_tools                     506
965c36a24e339b6058  bed_bath_table                   506
43d311335e499d9c6b  garden_tools                     401
8016ad823897a372db  garden_tools                     392
bb87a079a1f1519f73  garden_tools                     389
f6b889a5c7c61f2ac4  computers_accessories            343
41585e8d54d6772e08  watches_gifts                    322
2203795c972e5804a6  health_beauty                    283
5c781a9191c1e95ad7  computers_accessories            275
Name: product_id_, dtype: int64
In [25]:
## bygroupby seeing this the product_id and product_category the most most ordered product is from graden_tools and 
## others mostly from bed_bath_table

top 20 seller produts¶

In [26]:
len(df['seller_id'].unique())
Out[26]:
2900
In [27]:
df['seller_id_']=df['seller_id'].str[-6:]
In [28]:
plt.figure(figsize=(20,10))
df['seller_id_'].value_counts()[:10].plot.pie(autopct='%1.1f%%')
plt.title("Top 10 Seller",size=14, weight='bold')

##below pie chart shows the top 10 sellers and top 3 sellers have the closer proportions
Out[28]:
Text(0.5, 1.0, 'Top 10 Seller')
In [29]:
##Assuming for the orders' product category of these sellers, we can use 'product category' values.
##Below table shows the Top 10 sellers category, and since they can sell multiple product types, garden tools are the most 
##selling product of the best seller. 
In [30]:
seller_category= df.groupby(['seller_id_', 'product_category_name_english'])\
                        ['seller_id_'].count().sort_values(ascending=False).head(10)
seller_category
Out[30]:
seller_id_  product_category_name_english
523100      garden_tools                     1944
493884      bed_bath_table                   1652
7e94c0      watches_gifts                    1648
dab84a      bed_bath_table                   1373
0e0bfa      furniture_decor                  1315
b010ab      office_furniture                 1303
f13abc      telephony                        1188
3ad736      cool_stuff                       1099
b7556a      health_beauty                    1091
3b52b2      watches_gifts                    1010
Name: seller_id_, dtype: int64
In [31]:
f, (ax1, ax2) = plt.subplots(2, 1, figsize=(20,15))
group_category.plot.barh(ax=ax1, cmap='summer')
seller_category.plot.barh(ax=ax2, cmap='autumn')

ax1.set_title('Top10 Product', fontweight='bold')
ax2.set_title('Top10 Seller', fontweight='bold')

ax1.set_xlabel('Count', fontsize=15)
ax1.set_ylabel('Product Name', fontsize=15)
ax1.xaxis.set_tick_params(labelsize=12)
ax1.yaxis.set_tick_params(labelsize=15)

ax2.set_xlabel('Count', fontsize=15)
ax2.set_ylabel('Product Name', fontsize=15)
ax2.xaxis.set_tick_params(labelsize=12)
ax2.yaxis.set_tick_params(labelsize=15)
In [32]:
## Above graphs show the top products and top sellers category. 
##While most selling product item belongs to the furniture_decor, most seller item belongs to garden tools.
In [33]:
## now we can examine the most selling item weekely purchased items  which is 493884 and belong to bed_bath_table
In [34]:
df['order_week']= pd.to_datetime(df['order_purchase_timestamp'],
                                            format='%Y-%m-%d').dt.week
C:\Users\alasy\AppData\Local\Temp\ipykernel_4984\209791676.py:1: FutureWarning: Series.dt.weekofyear and Series.dt.week have been deprecated. Please use Series.dt.isocalendar().week instead.
  df['order_week']= pd.to_datetime(df['order_purchase_timestamp'],
In [35]:
#plt.figure(figsize=(20,12))
#sns.countplot('order_week', data=df[df['product_id_']== 
#                                        '493884'])
In [36]:
## Now we examine the most selling product category by weekly
In [37]:
#Weekly popular items 
items_weekly =df.groupby(['order_week','product_category_name_english'])\
                        ['product_category_name_english'].count().sort_values(ascending=False)
#Change to data frame
most_products= items_weekly.reset_index(name='count')
In [38]:
#Find the max value of row
max_selling_products= most_products[most_products['count'] 
                            == most_products.groupby(['order_week'])\
                            ['count'].transform(max)]
max_selling_products.head(10)
Out[38]:
order_week product_category_name_english count
0 47 bed_bath_table 445
1 32 bed_bath_table 388
3 29 bed_bath_table 366
4 33 bed_bath_table 354
5 23 bed_bath_table 349
7 8 computers_accessories 338
9 31 health_beauty 318
10 19 health_beauty 316
11 20 bed_bath_table 315
12 9 computers_accessories 314
In [39]:
max_selling_products['product_category_name_english'].value_counts()
Out[39]:
bed_bath_table           35
health_beauty             8
computers_accessories     3
sports_leisure            3
furniture_decor           2
housewares                1
toys                      1
Name: product_category_name_english, dtype: int64
In [40]:
print("Number of Unique Products = ",len(df['product_category_name_english'].unique()))
Number of Unique Products =  71
In [41]:
from wordcloud import WordCloud
a = list(df['product_category_name_english'])
word = [x for x in a if str(x) != 'nan']

word = " ".join(word)
wordcloud = WordCloud(width = 1200, height = 800, 
                background_color ='white',  
                min_font_size = 10).generate(word)                         
plt.figure(figsize = (10, 10), facecolor = None) 
plt.imshow(wordcloud) 
plt.axis("off") 
plt.tight_layout(pad = 0) 
  
plt.show() 

payments¶

In [42]:
## first we drop the most irrelavant colums that to make data more helpful to understand
payments= df.drop(columns=['product_name_lenght','product_description_lenght',
                                 'product_photos_qty','product_weight_g','product_length_cm',
                                 'product_height_cm','product_width_cm'])
In [43]:
price_details= df.groupby(['order_id','price','product_category_name_english',
                                 'yearly','weekly'])[['freight_value','payment_value']].sum().reset_index()
In [44]:
## As written on the data description,total order value an be calculated by sum of price and freight value
In [45]:
price_details['total_order_value'] = price_details['price'] + price_details['freight_value']
In [46]:
## We can calculate Gross Profit and Profit Margin by payment value and total order value
In [47]:
price_details['gross_profit'] = price_details['payment_value']- price_details['total_order_value']
price_details['profit_margin'] = price_details['gross_profit']/price_details['payment_value']
price_details['profit_margin'] = price_details['profit_margin'].astype('int64')
In [48]:
price_details.sort_values('gross_profit', ascending=False).head(10)
Out[48]:
order_id price product_category_name_english yearly weekly freight_value payment_value total_order_value gross_profit profit_margin
1422 03caa2c082116e1d31e67e9ae3700499 1680.00 fixed_telephony 2017-10 42 224.08 109312.64 1904.08 107408.56 0
10198 1b15974a0141d54e36626dca3fdc731a 100.00 computers_accessories 2018-03 10 202.40 44048.00 302.40 43745.60 0
16693 2cc9089445046817a7539d90805e6e5a 989.10 agro_industry_and_commerce 2017-12 50 146.94 36489.24 1136.04 35353.20 0
87914 e8fa22c3673b1dd17ea315021b1f0f61 284.99 drinks 2018-05 18 168.70 30186.00 453.69 29732.31 0
43473 736e1922ae60d0d6a89247b851902527 1790.00 fixed_telephony 2018-07 30 114.88 29099.52 1904.88 27194.64 0
24977 428a2f660dc84138d969ccd69a0ab6d5 65.49 furniture_decor 2017-12 50 243.30 18384.75 308.79 18075.96 0
21838 3a213fcdfe7d98be74ea0dc05a8b31ae 108.00 watches_gifts 2018-01 4 186.24 17786.88 294.24 17492.64 0
93511 f80549a97eb203e1566e026ab66f045b 137.90 computers_accessories 2017-09 39 388.10 17671.00 526.00 17145.00 0
25560 4412d97cb2093633afa85f11db46316c 120.00 computers_accessories 2018-06 23 83.99 15978.65 203.99 15774.66 0
71708 be382a9e1ed25128148b97d6bfdb21af 194.99 office_furniture 2017-11 47 479.28 16313.60 674.27 15639.33 0
In [49]:
plt.figure(figsize=(25,15))

sns.lineplot(x='yearly',y='gross_profit',
             data=price_details[price_details['product_category_name_english']\
             =='bed_bath_table'], label='bed_bath_table',color="green")
sns.lineplot(x='yearly', y='gross_profit',
             data=price_details[price_details['product_category_name_english']\
             =='health_beauty'], label='health_beauty', color="blue")
sns.lineplot(x='yearly', y='gross_profit',
             data=price_details[price_details['product_category_name_english']\
             =='computers_accessories'], label='computers_accessories', color="red")
sns.lineplot(x='yearly', y='gross_profit',
              data=price_details[price_details['product_category_name_english']\
             =='moveis_decoracao'], label='home_decoration', color="orange")
sns.lineplot(x='yearly', y='gross_profit',
             data=price_details[price_details['product_category_name_english']\
             =='furniture_decor'], label='furniture_decor', color="purple")
plt.title("Gross Profit of Top 5 Products (2016-2018)",fontweight='bold')
Out[49]:
Text(0.5, 1.0, 'Gross Profit of Top 5 Products (2016-2018)')
In [50]:
#Above graph shows yearly 'gross profit' distribution for the top 5 product category of ordered products and sellers. 
#According to data, we can say that informatic_accessories have the highest gross profit in Agust 2017 to November 2017. 
#On the other hand, the top 3 products which are bed_bath_table, health_beauty and computers_accessories, gross profits 
#are less than 200 units of currency.
In [51]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112638 entries, 0 to 112637
Data columns (total 53 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   order_id                       112638 non-null  object 
 1   order_item_id                  112638 non-null  float64
 2   product_id                     112638 non-null  object 
 3   seller_id                      112638 non-null  object 
 4   shipping_limit_date            112638 non-null  object 
 5   price                          112638 non-null  float64
 6   freight_value                  112638 non-null  float64
 7   product_name_lenght            112638 non-null  float64
 8   product_description_lenght     112638 non-null  float64
 9   product_photos_qty             112638 non-null  float64
 10  product_weight_g               112638 non-null  object 
 11  product_length_cm              112638 non-null  object 
 12  product_height_cm              112638 non-null  object 
 13  product_width_cm               112638 non-null  object 
 14  seller_zip_code_prefix         112638 non-null  float64
 15  seller_city                    112638 non-null  object 
 16  seller_state                   112638 non-null  object 
 17  customer_id                    112638 non-null  object 
 18  order_status                   112638 non-null  object 
 19  order_purchase_timestamp       112638 non-null  object 
 20  order_approved_at              112638 non-null  object 
 21  order_delivered_carrier_date   112638 non-null  object 
 22  order_delivered_customer_date  112638 non-null  object 
 23  order_estimated_delivery_date  112638 non-null  object 
 24  customer_unique_id             112638 non-null  object 
 25  customer_zip_code_prefix       112638 non-null  int64  
 26  customer_city                  112638 non-null  object 
 27  customer_state                 112638 non-null  object 
 28  review_id                      112638 non-null  object 
 29  review_score                   112638 non-null  float64
 30  review_comment_title           112638 non-null  object 
 31  review_comment_message         112638 non-null  object 
 32  review_creation_date           112638 non-null  object 
 33  review_answer_timestamp        112638 non-null  object 
 34  payment_sequential             112638 non-null  float64
 35  payment_type                   112638 non-null  object 
 36  payment_installments           112638 non-null  float64
 37  payment_value                  112638 non-null  float64
 38  product_category_name_english  112638 non-null  object 
 39  seller_lat                     112638 non-null  float64
 40  seller_lng                     112638 non-null  float64
 41  customer_lat                   112638 non-null  float64
 42  customer_lng                   112638 non-null  float64
 43  product_category               112638 non-null  object 
 44  cust_Region                    112638 non-null  object 
 45  delivered_time                 112638 non-null  object 
 46  estimate_time                  112638 non-null  object 
 47  weekly                         112638 non-null  int64  
 48  yearly                         112638 non-null  object 
 49  diff_days                      112638 non-null  int64  
 50  product_id_                    112638 non-null  object 
 51  seller_id_                     112638 non-null  object 
 52  order_week                     112638 non-null  int64  
dtypes: float64(15), int64(4), object(34)
memory usage: 45.5+ MB
In [52]:
ax = df[df['payment_type']!='not_defined']['payment_type'].value_counts().plot(kind='pie',figsize = (7,7),autopct='%.2f')
plt.title("Percent of Payment Options Used By Customers")
plt.show()
## this pie chart shows that the percent of payments options used by customers 
## mostly used payment mode is credit_card 
In [53]:
ax = df[df['payment_type'] == 'credit_card']['payment_installments'].value_counts().plot(kind='bar')
ax.set_title('payment_installments')
plt.show()
In [54]:
## the below graph shows that the customer wants more installments on the products which is costly
In [55]:
ax = sns.catplot('payment_installments','payment_value',kind='bar',
                 data  = df[df['payment_type'] == 'credit_card'])
plt.show()
C:\Users\alasy\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(

reviews data¶

In [56]:
ax = df['review_score'].value_counts().plot(kind='bar',figsize=(8,8))
ax.set_title('Reviews Count')
plt.show()
In [57]:
## this  above graphs shows the people giving stars to the produts 
## over 60000 people gave 5 stars
In [58]:
l = []
for i in range(len(df)):
    if df['review_score'][i] == 4 or df['review_score'][i] == 5:
        l.append('positive')
    elif df['review_score'][i] == 1 or df['review_score'][i] == 2:
        l.append('negative')
    else:
       l.append('neutral')
df['posorneg'] = l
ax = sns.countplot('posorneg',data = df)
ax.set_title('review Count')
plt.show()
C:\Users\alasy\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(
In [59]:
df['review_creation_date'] = pd.to_datetime(df['review_creation_date'])
df['review_creation_date_month'] = df['review_creation_date'].dt.to_period('M')
pd.crosstab(df['review_creation_date_month'],df['review_score'])
Out[59]:
review_score 1.0 2.0 3.0 4.0 5.0
review_creation_date_month
2016-10 26 9 14 34 123
2016-11 23 4 12 21 50
2016-12 7 0 1 1 10
2017-01 28 4 29 41 194
2017-02 103 50 142 323 1034
2017-03 237 75 247 584 1592
2017-04 210 89 215 432 1271
2017-05 403 143 365 860 2395
2017-06 331 100 343 777 2279
2017-07 342 138 348 767 2380
2017-08 405 163 401 962 3272
2017-09 388 141 346 967 2865
2017-10 537 159 379 1044 3023
2017-11 559 173 518 1072 3152
2017-12 1376 374 789 1738 4788
2018-01 824 272 628 1318 3839
2018-02 863 243 653 1287 3724
2018-03 1833 373 858 1619 4260
2018-04 1260 352 719 1531 4401
2018-05 848 244 638 1733 5153
2018-06 814 267 656 1482 4722
2018-07 518 156 466 1148 4100
2018-08 872 296 714 1964 6193
In [60]:
df = df.sort_values('review_creation_date_month')
g = sns.catplot('review_creation_date_month',kind="count",col = 'review_score',data = df)
g.set_xticklabels(rotation=90)
plt.show()
C:\Users\alasy\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(
In [61]:
## above plots clearly tells us that with time positive reviews are increasing and negative ones are decreasing
In [64]:
#CORR=df.corr()
#sns.heatmap(CORR,annot=True)
In [63]:
plt.figure(figsize=(15,15))
plt.title("product_category_name V/S freight_value")
plt.xlabel("product_category_name")
plt.ylabel("freight_value")
plt.xticks(rotation='vertical')
sns.scatterplot(df["product_category_name_english"],df["freight_value"])
## relation between product_category_name_english vs freight_value
## stationary and health_beauty has best relatioship
C:\Users\alasy\anaconda3\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
  warnings.warn(
Out[63]:
<AxesSubplot:title={'center':'product_category_name V/S freight_value'}, xlabel='product_category_name', ylabel='freight_value'>
In [65]:
actual_continuous_cols = ['price','freight_value','payment_value','product_weight_g','product_length_cm','product_height_cm','product_width_cm']
df[actual_continuous_cols].describe()
Out[65]:
price freight_value payment_value
count 112638.000000 112638.000000 112638.000000
mean 119.890695 19.994579 171.624030
std 181.346106 15.724898 264.617854
min 0.850000 0.000000 0.000000
25% 39.900000 13.072500 60.772500
50% 74.900000 16.310000 107.900000
75% 133.000000 21.180000 189.157500
max 6735.000000 409.680000 13664.080000

calculating distance between customer location and seller location using Haversine method (using latitude and longitude)¶

In [66]:
import plotly.express as px
In [68]:
!pip install geopy
Collecting geopy
  Downloading geopy-2.4.0-py3-none-any.whl (125 kB)
     -------------------------------------- 125.4/125.4 kB 2.5 MB/s eta 0:00:00
Collecting geographiclib<3,>=1.52
  Downloading geographiclib-2.0-py3-none-any.whl (40 kB)
     ---------------------------------------- 40.3/40.3 kB ? eta 0:00:00
Installing collected packages: geographiclib, geopy
Successfully installed geographiclib-2.0 geopy-2.4.0
In [80]:
import geopy.distance

def dist_cust_seller(lat1,lon1,lat2,lon2):
    coords_1 = (lat1, lon1)
    coords_2 = (lat2, lon2)

    return geopy.distance.geodesic(coords_1, coords_2).km


df_temp = df[['customer_lat','customer_lng','seller_lat','seller_lng']]

df['distance_seller_customer'] = df_temp.apply(lambda x: dist_cust_seller(x.customer_lat,
                                                                          x.customer_lng,x.seller_lat,x.seller_lng),axis = 1)
In [72]:
sns.distplot(df['distance_seller_customer'])
C:\Users\alasy\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning:

`distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).

Out[72]:
<AxesSubplot:xlabel='distance_seller_customer', ylabel='Density'>
In [101]:
## most of the customer are in ditsance between 0to 100km
In [82]:
df['distance_seller_customer'].min()
Out[82]:
0.0
In [83]:
df['distance_seller_customer'].max()
Out[83]:
8652.125673340268
In [86]:
df_distance=df.groupby(['customer_state','seller_state'])['distance_seller_customer'].mean()
In [87]:
sns.kdeplot(df_distance,color='magenta')
Out[87]:
<AxesSubplot:xlabel='distance_seller_customer', ylabel='Density'>
In [88]:
pd.crosstab(df['cust_Region'],df['product_category']).plot(kind = 'bar',stacked = True)
Out[88]:
<AxesSubplot:xlabel='cust_Region'>
In [102]:
## the above shows the products that are avaliable in each region most of the products are avaliable in southeast region
In [107]:
px.scatter_3d(df,x='cust_Region',y='product_category',z ='cust_Region',color='product_category')
In [89]:
plt.figure(figsize=(20,10))
df['product_category'].value_counts().plot.pie(autopct='%1.1f%%')
plt.title("Product Categories",size=14, weight='bold')
Out[89]:
Text(0.5, 1.0, 'Product Categories')
In [103]:
## the pie charts describes the product_categories most products from electronics and the least products are food and drinks
##the entertainment,health and beauty and houseandgarden ratios are closed in range
In [92]:
df['total_price'] = df[['price','freight_value']].sum(axis =1)
product_stats = df.groupby('product_id').agg({'order_item_id': 'sum', 'total_price': 'mean'}).reset_index()

# Plot the data
plt.figure(figsize=(10, 6))  # Adjust the figure size as needed
plt.scatter( product_stats['total_price'],product_stats['order_item_id'], alpha=0.5)
plt.ylabel('Number of Items Sold')
plt.xlabel('Total Price')
plt.title('Number of Items Sold vs. Total Price for Each Product')
plt.grid(True)
plt.show()
In [93]:
df['profit_loss'] = df['payment_value'] - df['total_price']
In [94]:
df[df['profit_loss'] == df['profit_loss'].min()][['customer_id','order_id','order_item_id',
                                                  'product_category_name_english','price','freight_value','payment_value',
                                                  'payment_type','payment_installments','order_status','review_score']]
Out[94]:
customer_id order_id order_item_id product_category_name_english price freight_value payment_value payment_type payment_installments order_status review_score
42006 eb7a157e8da9c488cd4ddc48711f1097 9de73f3e6157169ad6c32b9f313c7dcb 1.0 baby 3899.0 135.44 135.44 credit_card 1.0 delivered 5.0
In [95]:
df[df['profit_loss'] == df['profit_loss'].max()][['customer_id','order_id','order_item_id','product_category_name_english',
                                                  'price','freight_value','payment_value','payment_type',
                                                  'payment_installments','order_status','review_score','profit_loss']]
Out[95]:
customer_id order_id order_item_id product_category_name_english price freight_value payment_value payment_type payment_installments order_status review_score profit_loss
78670 1617b1357756262bfa56ab541c47bc16 03caa2c082116e1d31e67e9ae3700499 2.0 fixed_telephony 1680.0 28.01 13664.08 credit_card 1.0 delivered 1.0 11956.07
78671 1617b1357756262bfa56ab541c47bc16 03caa2c082116e1d31e67e9ae3700499 3.0 fixed_telephony 1680.0 28.01 13664.08 credit_card 1.0 delivered 1.0 11956.07
78672 1617b1357756262bfa56ab541c47bc16 03caa2c082116e1d31e67e9ae3700499 4.0 fixed_telephony 1680.0 28.01 13664.08 credit_card 1.0 delivered 1.0 11956.07
78673 1617b1357756262bfa56ab541c47bc16 03caa2c082116e1d31e67e9ae3700499 5.0 fixed_telephony 1680.0 28.01 13664.08 credit_card 1.0 delivered 1.0 11956.07
78674 1617b1357756262bfa56ab541c47bc16 03caa2c082116e1d31e67e9ae3700499 6.0 fixed_telephony 1680.0 28.01 13664.08 credit_card 1.0 delivered 1.0 11956.07
78675 1617b1357756262bfa56ab541c47bc16 03caa2c082116e1d31e67e9ae3700499 7.0 fixed_telephony 1680.0 28.01 13664.08 credit_card 1.0 delivered 1.0 11956.07
78676 1617b1357756262bfa56ab541c47bc16 03caa2c082116e1d31e67e9ae3700499 8.0 fixed_telephony 1680.0 28.01 13664.08 credit_card 1.0 delivered 1.0 11956.07
78669 1617b1357756262bfa56ab541c47bc16 03caa2c082116e1d31e67e9ae3700499 1.0 fixed_telephony 1680.0 28.01 13664.08 credit_card 1.0 delivered 1.0 11956.07
In [96]:
px.scatter_3d(df,x='total_price',y='payment_value',z = 'profit_loss',color='product_category')
In [97]:
px.scatter_3d(df,x = 'distance_seller_customer',y = 'freight_value', z= 'payment_value',color = 'product_category')
In [98]:
actual_continuous_cols.append('distance_seller_customer')
sns.pairplot(df[actual_continuous_cols])
Out[98]:
<seaborn.axisgrid.PairGrid at 0x1c40bf91b50>
In [99]:
CORR = df[actual_continuous_cols].corr()
sns.heatmap(CORR,annot = True)
Out[99]:
<AxesSubplot:>
In [100]:
#We can see there is a high correlation between product_weight and freight_value, payment_value and price.
#Also between product dimensions.
In [ ]: